library(UKgrid)
library(TSstudio)
library(dplyr)
library(forecast)
library(lubridate)
library(plotly)Laboratório: UKgrid
Utilizaremos o conjunto de dados
UKgrid do pacote UKgrid do R para ilustrar como podemos utilizar o modelo de regressão linear para lidar com problemas de multiplos períodos sazonais. Nosso objetivo é fazer previsão para os próximos dias
Pacotes
Dados
data("UKgrid")
glimpse(UKgrid)Rows: 254,592
Columns: 9
$ TIMESTAMP <dttm> 2005-04-01 00:00:00, 2005-04-01 00:30:00, 2…
$ ND <int> 32926, 32154, 33633, 34574, 34720, 34452, 33…
$ I014_ND <int> 32774, 32032, 33495, 34460, 34641, 34347, 33…
$ TSD <int> 34286, 33885, 35082, 36028, 36179, 35921, 35…
$ ENGLAND_WALES_DEMAND <int> 29566, 28871, 30340, 31253, 31325, 31094, 30…
$ EMBEDDED_WIND_GENERATION <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ EMBEDDED_WIND_CAPACITY <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ EMBEDDED_SOLAR_GENERATION <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ EMBEDDED_SOLAR_CAPACITY <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
UKdaily <- extract_grid(type = "data.frame", columns = "ND", aggregate = "daily")
head(UKdaily) TIMESTAMP ND
1 2005-04-01 1920069
2 2005-04-02 1674699
3 2005-04-03 1631352
4 2005-04-04 1916693
5 2005-04-05 1952082
6 2005-04-06 1964584
ts_plot(UKdaily, title = "Demanda Nacional Diária de Energia no Reino Unido", Ytitle = "MW", Xtitle = "Ano")EDA
- Tendência?
- Sazonalidade?
- Qual o período sazonal?
- exisem outros possíveis períodos sazonais?
ts_heatmap(UKdaily[which(year(UKdaily$TIMESTAMP) >= 2015),],
title = "Mapa de calor da demanda nacional diária (detrended) de energia no Reino Unido")UKdaily_df <- UKdaily |>
mutate(weekday = wday(TIMESTAMP, label = TRUE),
month = month(TIMESTAMP, label = TRUE),
lag365 = dplyr::lag(ND, 365))
UKdaily_summarise <- UKdaily_df |> group_by(weekday, month) |>
summarise(Media = mean(ND, na.rm = TRUE))
plot_ly(data = UKdaily_summarise, x = ~month, y = ~Media, type =
"bar",color = ~weekday) |>
layout(title = "Demanda média diária de energia por mês")str(UKdaily_df)'data.frame': 5304 obs. of 5 variables:
$ TIMESTAMP: Date, format: "2005-04-01" "2005-04-02" ...
$ ND : int 1920069 1674699 1631352 1916693 1952082 1964584 1990895 2003982 1811436 1684720 ...
$ weekday : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 6 7 1 2 3 4 5 6 7 1 ...
$ month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 4 4 4 4 4 4 4 4 ...
$ lag365 : int NA NA NA NA NA NA NA NA NA NA ...
start_date <- min(UKdaily_df$TIMESTAMP)
start <- c(year(start_date), yday(start_date))
UK_ts <- ts(UKdaily_df$ND, start = start, frequency = 365)
acf(UK_ts, lag.max = 365*6)
Modelo
Apenas para poder comparar observados e estimado, vamos dividir a série:
h <- 365
UKpartitions <- ts_split(UK_ts, sample.out = h)
train_ts <- UKpartitions$train
test_ts <- UKpartitions$test
train_df <- UKdaily_df[1:(nrow(UKdaily_df) - h), ]
test_df <- UKdaily_df[(nrow(UKdaily_df) - h + 1):nrow(UKdaily_df), ]modelo_tslm <- tslm(train_ts ~ season + trend + weekday + month , data = train_df)
fore <- forecast(modelo_tslm, h = h, newdata = test_df)test_forecast(actual = UK_ts, forecast.obj = fore, test = test_ts)accuracy(fore , test_ts) ME RMSE MAE MPE MAPE MASE
Training set -7.233089e-11 70611.20 52773.55 -0.1675864 3.174944 0.4396604
Test set -1.891129e+04 82845.32 66699.81 -1.4303754 4.771410 0.5556811
ACF1 Theil's U
Training set 0.7539041 NA
Test set 0.6152629 0.6998989
modelo_tslm_2 <- tslm(train_ts ~ trend + weekday + month , data = train_df)
fore_2 <- forecast(modelo_tslm_2, h = h, newdata = test_df)
accuracy(fore_2 , test_ts) ME RMSE MAE MPE MAPE MASE
Training set -5.226344e-11 87550.26 62678.62 -0.2555417 3.756523 0.5221803
Test set -1.833883e+04 95112.70 74873.84 -1.5121833 5.331113 0.6237796
ACF1 Theil's U
Training set 0.8035804 NA
Test set 0.7091169 0.8029323
modelo_tslm_3 <- tslm(train_ts ~ season + trend + weekday + month + lag365, data = train_df)
fore_3 <- forecast(modelo_tslm_3, h = h, newdata = test_df)
accuracy(fore_3 , test_ts) ME RMSE MAE MPE MAPE MASE
Training set -7.603646e-11 69904.92 52006.75 -0.1717563 3.163385 0.4359116
Test set -1.754061e+04 81783.55 65957.66 -1.3613252 4.722083 0.5528457
ACF1 Theil's U
Training set 0.7500146 NA
Test set 0.6094666 0.6925767
Modelo para produçãp
modelo_final <- tslm(UK_ts ~ season + trend + weekday + month + lag365, data = UKdaily_df)
checkresiduals(modelo_final, plot = TRUE)
Breusch-Godfrey test for serial correlation of order up to 730
data: Residuals from Linear regression model
LM test = 3301.1, df = 730, p-value < 2.2e-16
O modelo não captura apropriadamente a dinâmica dos dados. Algumas alternativas são incluir variáveis exógenas ou, após utilizar o modelo de regressão, modelar os resíduos por um modelo da ARIMA